{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "275e5a2b-10c9-4737-9789-e11fc7bb6ff1",
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline \n",
    "%config InlineBackend.figure_format = 'retina'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "4e6b685e-e5ee-4a16-b757-15773d901448",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Jun 22 01:54:53 2022       \n",
      "+-----------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |\n",
      "|-------------------------------+----------------------+----------------------+\n",
      "| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n",
      "|                               |                      |               MIG M. |\n",
      "|===============================+======================+======================|\n",
      "|   0  Tesla T4            Off  | 00000000:00:1E.0 Off |                    0 |\n",
      "| N/A   46C    P0    27W /  70W |   6041MiB / 15109MiB |      0%      Default |\n",
      "|                               |                      |                  N/A |\n",
      "+-------------------------------+----------------------+----------------------+\n",
      "                                                                               \n",
      "+-----------------------------------------------------------------------------+\n",
      "| Processes:                                                                  |\n",
      "|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |\n",
      "|        ID   ID                                                   Usage      |\n",
      "|=============================================================================|\n",
      "+-----------------------------------------------------------------------------+\n"
     ]
    }
   ],
   "source": [
    "!nvidia-smi"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "56bab6aa-a554-4e7c-915c-7021e740a2e5",
   "metadata": {
    "tags": []
   },
   "source": [
    "### [Numba](https://numba.pydata.org/)\n",
    "\"Numpy for GPU\" using Nvidia CUDA library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ba1334af-3c3b-4555-ae44-db943bb7cc05",
   "metadata": {},
   "outputs": [],
   "source": [
    "from numba import cuda, float32\n",
    "import numpy as np\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2aa4d227-d0b2-42cd-a029-c7afe48abb1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ[\"NUMBA_CUDA_USE_NVIDIA_BINDING\"] = \"1\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "61124317-6bcf-41dd-b8ca-324a7dd01e17",
   "metadata": {},
   "source": [
    "#### Setup matrix-multipication dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "96d7b0be-0aef-4d51-a869-b64842846e6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "x_h = np.arange(100000000).reshape([10000, 10000])\n",
    "y_h = np.ones([10000, 10000])\n",
    "z_h = np.zeros([10000, 10000])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9c15a27b-d028-4026-98f7-072a470978da",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### Slow CPU matrix multiplication"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "90e33d79-fb67-4ac5-9f45-e2035de1db04",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 5s, sys: 1.67 s, total: 1min 7s\n",
      "Wall time: 17.5 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[4.99950000e+07, 4.99950000e+07, 4.99950000e+07, ...,\n",
       "        4.99950000e+07, 4.99950000e+07, 4.99950000e+07],\n",
       "       [1.49995000e+08, 1.49995000e+08, 1.49995000e+08, ...,\n",
       "        1.49995000e+08, 1.49995000e+08, 1.49995000e+08],\n",
       "       [2.49995000e+08, 2.49995000e+08, 2.49995000e+08, ...,\n",
       "        2.49995000e+08, 2.49995000e+08, 2.49995000e+08],\n",
       "       ...,\n",
       "       [9.99749995e+11, 9.99749995e+11, 9.99749995e+11, ...,\n",
       "        9.99749995e+11, 9.99749995e+11, 9.99749995e+11],\n",
       "       [9.99849995e+11, 9.99849995e+11, 9.99849995e+11, ...,\n",
       "        9.99849995e+11, 9.99849995e+11, 9.99849995e+11],\n",
       "       [9.99949995e+11, 9.99949995e+11, 9.99949995e+11, ...,\n",
       "        9.99949995e+11, 9.99949995e+11, 9.99949995e+11]])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "np.matmul(x_h, y_h, z_h)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2973a411-0fa1-4bf7-8e21-56a22d890827",
   "metadata": {},
   "source": [
    "#### Setup GPU-specific constructs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "13f43395-0595-4f8d-acd9-48965a47e4f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "TPB = 16\n",
    "\n",
    "threadsperblock = (TPB, TPB)\n",
    "blockspergrid_x = math.ceil(z_h.shape[0] / threadsperblock[0])\n",
    "blockspergrid_y = math.ceil(z_h.shape[1] / threadsperblock[1])\n",
    "blockspergrid = (blockspergrid_x, blockspergrid_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8e19efb3-1e2d-4d10-b1f5-2e9f14bc0fd0",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### Slow GPU matrix multiplication\n",
    "\n",
    "This implementation will perform poorly because individual matrix elements are reloaded multiple times into device memory (slow)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "ac9674e4-56f7-44d1-8b2d-784a2847235d",
   "metadata": {},
   "outputs": [],
   "source": [
    "@cuda.jit\n",
    "def matmul(A, B, C):\n",
    "    \"\"\"Perform square matrix multiplication of C = A * B.\"\"\"\n",
    "    i, j = cuda.grid(2)\n",
    "    if i < C.shape[0] and j < C.shape[1]:\n",
    "        tmp = 0.\n",
    "        for k in range(A.shape[1]):\n",
    "            tmp += A[i, k] * B[k, j]\n",
    "        C[i, j] = tmp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "35438b12-ed7a-43bd-912b-cd8bba62fb06",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 24.6 s, sys: 16.2 s, total: 40.7 s\n",
      "Wall time: 41 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[4.99950000e+07, 4.99950000e+07, 4.99950000e+07, ...,\n",
       "        4.99950000e+07, 4.99950000e+07, 4.99950000e+07],\n",
       "       [1.49995000e+08, 1.49995000e+08, 1.49995000e+08, ...,\n",
       "        1.49995000e+08, 1.49995000e+08, 1.49995000e+08],\n",
       "       [2.49995000e+08, 2.49995000e+08, 2.49995000e+08, ...,\n",
       "        2.49995000e+08, 2.49995000e+08, 2.49995000e+08],\n",
       "       ...,\n",
       "       [9.99749995e+11, 9.99749995e+11, 9.99749995e+11, ...,\n",
       "        9.99749995e+11, 9.99749995e+11, 9.99749995e+11],\n",
       "       [9.99849995e+11, 9.99849995e+11, 9.99849995e+11, ...,\n",
       "        9.99849995e+11, 9.99849995e+11, 9.99849995e+11],\n",
       "       [9.99949995e+11, 9.99949995e+11, 9.99949995e+11, ...,\n",
       "        9.99949995e+11, 9.99949995e+11, 9.99949995e+11]])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "x_d = cuda.to_device(x_h)\n",
    "y_d = cuda.to_device(y_h)\n",
    "z_d = cuda.to_device(z_h)\n",
    "\n",
    "matmul[blockspergrid, threadsperblock](x_d, y_d, z_d)\n",
    "\n",
    "z_d.copy_to_host()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b0edf4c-b78a-4092-afdc-426feca25bda",
   "metadata": {},
   "source": [
    "#### Fast GPU matrix multiplication\n",
    "While a bit more complex, this implementation will perform much faster because we are loading blocks of matrix elements into device memory and using shared memory across those blocks (fast)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "7d403275-e3b9-4ef7-a264-cb8d32d5171b",
   "metadata": {},
   "outputs": [],
   "source": [
    "@cuda.jit\n",
    "def fast_matmul(A, B, C):\n",
    "    \"\"\"\n",
    "    Perform matrix multiplication of C = A * B using CUDA shared memory.\n",
    "\n",
    "    Reference: https://stackoverflow.com/a/64198479/13697228 by @RobertCrovella\n",
    "    \"\"\"\n",
    "    # Define an array in the shared memory\n",
    "    # The size and type of the arrays must be known at compile time\n",
    "    sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)\n",
    "    sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)\n",
    "\n",
    "    x, y = cuda.grid(2)\n",
    "\n",
    "    tx = cuda.threadIdx.x\n",
    "    ty = cuda.threadIdx.y\n",
    "    bpg = cuda.gridDim.x    # blocks per grid\n",
    "\n",
    "    # Each thread computes one element in the result matrix.\n",
    "    # The dot product is chunked into dot products of TPB-long vectors.\n",
    "    tmp = float32(0.)\n",
    "    for i in range(bpg):\n",
    "        # Preload data into shared memory\n",
    "        sA[ty, tx] = 0\n",
    "        sB[ty, tx] = 0\n",
    "        if y < A.shape[0] and (tx + i * TPB) < A.shape[1]:\n",
    "            sA[ty, tx] = A[y, tx + i * TPB]\n",
    "        if x < B.shape[1] and (ty + i * TPB) < B.shape[0]:\n",
    "            sB[ty, tx] = B[ty + i * TPB, x]\n",
    "\n",
    "        # Wait until all threads finish preloading\n",
    "        cuda.syncthreads()\n",
    "\n",
    "        # Computes partial product on the shared memory\n",
    "        for j in range(TPB):\n",
    "            tmp += sA[ty, j] * sB[j, tx]\n",
    "\n",
    "        # Wait until all threads finish computing\n",
    "        cuda.syncthreads()\n",
    "    if y < C.shape[0] and x < C.shape[1]:\n",
    "        C[y, x] = tmp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "20b7ffa0-a28f-48a3-81b9-1d84bec8bfd9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3.41 s, sys: 1.73 s, total: 5.14 s\n",
      "Wall time: 5.14 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([[4.99928960e+07, 4.99928960e+07, 4.99928960e+07, ...,\n",
       "        4.99928960e+07, 4.99928960e+07, 4.99928960e+07],\n",
       "       [1.49990816e+08, 1.49990816e+08, 1.49990816e+08, ...,\n",
       "        1.49990816e+08, 1.49990816e+08, 1.49990816e+08],\n",
       "       [2.49990416e+08, 2.49990416e+08, 2.49990416e+08, ...,\n",
       "        2.49990416e+08, 2.49990416e+08, 2.49990416e+08],\n",
       "       ...,\n",
       "       [9.99893041e+11, 9.99893041e+11, 9.99893041e+11, ...,\n",
       "        9.99893041e+11, 9.99893041e+11, 9.99893041e+11],\n",
       "       [9.99907918e+11, 9.99907918e+11, 9.99907918e+11, ...,\n",
       "        9.99907918e+11, 9.99907918e+11, 9.99907918e+11],\n",
       "       [1.00003375e+12, 1.00003375e+12, 1.00003375e+12, ...,\n",
       "        1.00003375e+12, 1.00003375e+12, 1.00003375e+12]])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "x_d = cuda.to_device(x_h)\n",
    "y_d = cuda.to_device(y_h)\n",
    "z_d = cuda.to_device(z_h)\n",
    "\n",
    "fast_matmul[blockspergrid, threadsperblock](x_d, y_d, z_d)\n",
    "\n",
    "z_d.copy_to_host()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e06f300-a25b-4585-b691-8eca427726bb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "reMars2022:Python",
   "language": "python",
   "name": "conda-env-reMars2022-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
